# ===== VirusTotal Code =====
import os
from os import walk
from os.path import join, splitext
import shutil
import multiprocessing as mp
import requests
import time
# Destination folders for classified APKs: clean, malicious, and not-yet-processed.
benignDIR = '/media/saurabh/Seagate Backup Plus Drive/dataset/good/'
malDIR = '/media/saurabh/Seagate Backup Plus Drive/dataset/bad/'
unprocessed = '/media/saurabh/Seagate Backup Plus Drive/dataset/UP/'
# Directory holding the chunk of APKs to look up on VirusTotal in this run.
tesDirectory = '/media/saurabh/Seagate Backup Plus Drive/chunk1'
# VirusTotal API keys, rotated per request to spread the free-tier rate limit.
# NOTE(review): live credentials hard-coded in source — move to env vars/config
# and revoke these keys.
keys = ['50e1e02593022c105e9e543324def583bcd488decade12a10b39389b02572e4a', \
'39d024ed2b96bc6b07ce554536a4b173bec912b57a96c15d29ca502aa438971f', \
'0abdb91a391ab0e5eb01d317859d7039b9fc42a4b9ff567d4ccb61dfa861b399', \
'71ad9f63fd6d08ee15b068de7f09eb63f3ee7d002a4e35aa834fac67a05f5e1b', \
'6bcee5048fee69558f0c6b82d2e29dce150d46f414b90f8350ba948ba18976f6', \
'a7258e4ea0c834b14fe9548ed9e6365809f1863c549c5d128f682d6314f3f5fb', \
'bc4c74d1692c8b3d332770c56aa718e202e370532d7c3bfd80f20c3cbc4db6f8', \
'5fa4770f3028f009034a57125cd701053466cb131c25e270c8fed983da35a6af', \
'0e964adb61523a9806d8c4d37c0c2d97fa2d029da8704fbf39f8b7272e01983c', \
'eca814853a84500d5e832c40cb190f5eef871279f6d410a74e51819c9af620fc', \
'33334c817ad04e96ad234e53a25510d7426121bbe6fd8df621acb8518993b481', \
'28f7474010592817cf8519ccd1083b8ffa3db92b141c6ad44d22141a4dddd135', \
'e1c048acdea826a01595786b9efcb771a6b91c0bba7c34be3cedee0e1e67473c', \
'b0e2cfd54310f08af0a089c13878d63643e0abb2dc9036beccc935f5113bc31f', \
'b19e02479cb1acb2d5b490847eb7bac7cc5463948296278214d2f6b3a615a317', \
'adf1b8d54188d4f03733ad30803dcba9ee20deeded9c539810dc35ab78f0c55e', \
'4b4086b4f8eafd44c5db55f9d393ebab308593eef98123253a7802c8c41b32ea', \
'61ee504ea1e921f475db7aa2d59b6ffc833d251c821cdaedc34803607e6a9077']
def virusTotalAPICall(key, apk_hash):
    """Query the VirusTotal v2 file-report API for one sample hash.

    Parameters:
        key      -- VirusTotal API key to use for this request.
        apk_hash -- hash of the APK to look up (the 'resource' parameter).

    Returns the 'positives' count from the report (number of engines that
    flagged the sample; 0 means clean), -2 when the API rate limit was hit
    (HTTP 204), and -1 on any other failure.
    """
    result = 0
    url = 'https://www.virustotal.com/vtapi/v2/file/report'
    params = {'apikey': key, 'resource': apk_hash}
    responseCode = 0
    try:
        # timeout so a stalled connection cannot hang a pool worker forever
        response = requests.get(url, params=params, timeout=30)
        print(response)
        responseCode = response.status_code
        payload = response.json()  # parse once instead of twice
        print(payload)
        result = payload['positives']
    except Exception:
        # back off briefly; free keys are rate limited to a few requests/min
        time.sleep(1)
        print("Exception for : " + apk_hash)
        if responseCode == 204:
            result = -2  # quota exceeded
        else:
            result = -1  # network error / malformed or keyless response
    return result
class CheckCount:
    """Tracks how many files were submitted to the pool and how many finished."""

    def __init__(self):
        self.totalFiles = 0  # jobs submitted so far
        self.completed = 0   # jobs whose completion callback has fired

    def incCount(self):
        """Record one more submitted file."""
        self.totalFiles += 1

    def getTotal(self):
        """Return the number of files submitted so far."""
        return self.totalFiles

    def completedCallback(self, res=''):
        """Pool callback: count one finished job and print progress.

        `res` receives the worker's return value and is ignored.
        """
        self.completed += 1
        print(self.completed, " files Completed out of ", self.totalFiles)
def testAndMove(filePath, fileHash, key):
    """Look up fileHash on VirusTotal and move the APK to the matching folder.

    result == 0  -> clean, move to benignDIR
    result == -1 -> lookup failed, move to `unprocessed` for a retry pass
    result == -2 -> rate limit hit, leave the file in place
    anything else (positives > 0) -> malicious, move to malDIR
    """
    time.sleep(1)  # stay under the per-key request rate limit
    result = virusTotalAPICall(key, fileHash)
    if result == 0:
        print(fileHash + "--> good")
        shutil.move(filePath, benignDIR)
    elif result == -1:
        print(fileHash + "--> not processed")
        shutil.move(filePath, unprocessed)
    elif result == -2:
        print("Limit Exceeded")
        return
    else:
        shutil.move(filePath, malDIR)
        print(fileHash + "--> bad")
def processAPKs(apk_dir_path):
    """Walk apk_dir_path and classify every file through a 3-worker pool.

    Each file name is assumed to start with a 32-char hash (filename[:32]);
    API keys are rotated round-robin across submissions.
    """
    pool = mp.Pool(3)
    countObj = CheckCount()
    for dirpath, dirnames, filenames in walk(apk_dir_path):
        for filename in filenames:
            apkPath = join(dirpath, filename)
            print(apkPath)
            # len(keys) instead of the hard-coded 18 so the rotation stays
            # correct if keys are added or removed
            pool.apply_async(
                testAndMove,
                args=(apkPath, filename[:32], keys[countObj.getTotal() % len(keys)]),
                callback=countObj.completedCallback)
            countObj.incCount()
    pool.close()
    pool.join()  # block until every submitted lookup has finished
if __name__ == '__main__':
    # Only kick off processing when run as a script, not on import.
    processAPKs(tesDirectory)
# ===== APK Reverse Engineering Code =====
import os
import sys
import subprocess

# Path to the apktool jar used for decompilation.
APKTOOL = "C:\\Users\\subha\\OneDrive\\Desktop\\apktool.jar"


def decompile_apks(walk_dir, apktool=APKTOOL):
    """Run apktool on every .apk found under walk_dir, printing progress."""
    count = 0
    for root, subdirs, files in os.walk(walk_dir):
        for file in files:
            if file.endswith(".apk"):
                count += 1
                print(str(count) + "-" + file)
                # argument list + no shell: file names with spaces or shell
                # metacharacters cannot break or hijack the command
                subprocess.run(["java", "-jar", apktool, "d", root + "/" + file])


if __name__ == '__main__':
    decompile_apks(sys.argv[1])
#Parse Android Manifest.XML
import sys
import csv
import numpy as np
import pandas as pd
import re
import os
import subprocess
from os.path import walk
from os.path import join, splitext
# Feature table: one row per APK. The first 8 columns are fixed; one extra
# 0/1 column is appended dynamically for every permission/hardware string seen.
perm = pd.DataFrame(columns=["label","file_name","activity_list", "service_list", "content_provider_list", "broadcast_receiver_list", "intent_filter_list","custom_permissions"])
# Running count of .ldata files processed so far across both parseData calls.
filecount = 0
def parseData(dirpath, label, count):
    """Parse every apktool .ldata manifest dump under dirpath into `perm`.

    For each .ldata file: count the component tags, collect the declared
    android permissions and hardware components (adding a 0-default column
    to the module-level DataFrame `perm` for any string not seen before),
    then append one row [label, file, counts..., 0/1 permission flags].

    Parameters:
        dirpath -- root directory to walk for .ldata files.
        label   -- class label for every file in this tree (1 good, 0 malware).
        count   -- running processed-file count; the updated value is returned.
    """
    for dirpath, dirnames, filenames in os.walk(dirpath):
        for file in filenames:
            if not file.endswith(".ldata"):
                continue
            path = join(dirpath, file)
            # read whole file once for the substring counts (with-block closes it)
            with open(path, "r") as fh:
                f = fh.read()
            all_permission = []
            activity_list = f.count('ActivityList')
            service_list = f.count('ServiceList')
            content_provider_list = f.count('ContentProviderList')
            broadcast_receiver_list = f.count('BroadcastReceiverList')
            intent_filter_list = f.count('IntentFilterList')
            # pass 1: standard android permissions
            with open(path, "r") as shakes:
                for line in shakes:
                    if "RequestedPermissionList" in line:
                        line = line.strip()
                        if line.startswith("RequestedPermissionList_"):
                            # [32:] strips "RequestedPermissionList_" (24) plus
                            # "android." (8), leaving e.g. "permission.INTERNET"
                            line = line[32:]
                        if line.startswith("permission"):
                            all_permission.append(line)
                            if line not in list(perm.columns.values):
                                perm[line] = 0  # new feature column, default 0
            # pass 2: hardware components
            with open(path, "r") as shakes_2:
                for line in shakes_2:
                    if "HardwareComponentsList" in line:
                        line = line.strip()
                        if line.startswith("HardwareComponentsList"):
                            # [31:] strips the tag plus "android." prefix
                            line = line[31:]
                        all_permission.append(line)
                        if line not in list(perm.columns.values):
                            perm[line] = 0
            # pass 3: custom (non-"android.permission") permission entries
            custom_permissions = 0
            with open(path, "r") as shakes_3:  # original leaked this handle
                for line in shakes_3:
                    if "RequestedPermissionList" in line:
                        line = line.strip()
                        if line.startswith("RequestedPermissionList"):
                            line = line[32:]
                            if not line.startswith("permission"):
                                # NOTE(review): stores the LENGTH of the last
                                # custom permission name, not a count — looks
                                # like a bug, preserved as-is to keep the
                                # feature table comparable with old runs.
                                custom_permissions = len(line)
            # fixed columns first, then a 0/1 flag for every dynamic column
            # (perm columns after index 8 are the permission/hardware names)
            data = [label, file, activity_list, service_list,
                    content_provider_list, broadcast_receiver_list,
                    intent_filter_list, custom_permissions] + \
                   [1 if p in all_permission else 0
                    for p in list(perm.columns.values)[8:]]
            perm.loc[len(perm)] = data
            count = count + 1
            print(count, " files processed")
    return count
if __name__ == '__main__':
    # Raw strings: '\U' in a plain 'C:\Users\...' literal is a SyntaxError
    # in Python 3, so these Windows paths must be r'' strings.
    filecount = parseData(r'C:\Users\subha\OneDrive\Desktop\IITK Winter\Machine Learning\Dataset\goodware', 1, filecount)
    filecount = parseData(r'C:\Users\subha\OneDrive\Desktop\IITK Winter\Machine Learning\Dataset\malware', 0, filecount)
    # Dump the assembled feature table for the training notebooks below.
    perm.to_csv('Dataset_Final.csv', index=False)
'''
'''
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import csv
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from IPython.display import display
# Load the 15000-sample feature table built by the manifest parser.
filename_datasets = 'Dataset_Final_15000.csv'
df_datasets = pd.read_csv(filename_datasets)
df_datasets = df_datasets.drop(['file_name'],axis=1)
df_datasets.duplicated() # flags duplicate rows; NOTE(review): return value unused
df_datasets.drop_duplicates() # NOTE(review): result not assigned back — no effect
#basic plot
#plt.boxplot(df_datasets)
'''#notched plot
plt.figure()
plt.boxplot(df_datasets, 1)
# change outlier point symbols
plt.figure()
plt.boxplot(df_datasets, 0, 'gD')
# don't show outlier points
plt.figure()
plt.boxplot(df_datasets, 0, '')
# horizontal boxes
plt.figure()
plt.boxplot(df_datasets, 0, 'rs', 0)
# change whisker length
plt.figure()
plt.boxplot(df_datasets, 0, 'rs', 0, 0.75)
'''
print(df_datasets.shape)
display(df_datasets.head())
display(df_datasets.describe())
# Heatmap of pairwise feature correlations.
correlation_matrix = df_datasets.corr()
plt.figure(figsize=(10,8))
print('Heatmap')
ax = sns.heatmap(correlation_matrix, vmax=1, square=True,annot=True,cmap='RdYlGn')
plt.title('Correlation matrix between the features')
plt.show()
def get_train_test(df, y_col, ratio):
    """Randomly split df into train/test feature matrices and label vectors.

    Each row independently lands in the training set with probability `ratio`.

    Parameters:
        df    -- DataFrame containing features plus the label column.
        y_col -- name of the label column.
        ratio -- expected fraction of rows assigned to the training set.

    Returns (X_train, Y_train, X_test, Y_test) as numpy arrays.
    """
    mask = np.random.rand(len(df)) < ratio
    # .copy() so deleting the label column below cannot raise
    # SettingWithCopyWarning or touch a view of the caller's frame
    df_train = df[mask].copy()
    df_test = df[~mask].copy()
    Y_train = df_train[y_col].values
    Y_test = df_test[y_col].values
    del df_train[y_col]
    del df_test[y_col]
    X_train = df_train.values
    X_test = df_test.values
    return X_train, Y_train, X_test, Y_test
y_col = 'label'          # target column in the feature table
train_test_ratio = 0.7   # probability a row lands in the training set
X_train, Y_train, X_test, Y_test = get_train_test(df_datasets, y_col, train_test_ratio)
# Models to benchmark; the keys double as file-name prefixes for saved models.
dict_classifiers = {
"Logistic_Regression": LogisticRegression(),
"Nearest_Neighbors": KNeighborsClassifier(),
"Linear_SVM": SVC(),
"Gradient_Boosting_Classifier": GradientBoostingClassifier(),
"Decision_Tree": tree.DecisionTreeClassifier(),
"Random_Forest": RandomForestClassifier(n_estimators = 18),
"Neural_Net": MLPClassifier(alpha = 1),
"Naive_Bayes": GaussianNB()
}
no_classifiers = len(dict_classifiers.keys())
def batch_classify(X_train, Y_train, X_test, Y_test, verbose = True):
    """Fit every model in the module-level dict_classifiers and benchmark it.

    Each fitted model is persisted to '<name>_finalized_model_15000.sav'.
    Returns a DataFrame with one row per classifier: scores, training time,
    and confusion-matrix counts on the test set.
    """
    df_results = pd.DataFrame(
        data=np.zeros(shape=(no_classifiers, 8)),
        columns=['classifier', 'train_score', 'test_score', 'training_time',
                 'true negative', 'false positive', 'false negative', 'true positive'])
    count = 0
    for key, classifier in dict_classifiers.items():
        # time.clock() was removed in Python 3.8; perf_counter() replaces it
        t_start = time.perf_counter()
        classifier.fit(X_train, Y_train)
        classifierstored = classifier.predict(X_test)
        # NOTE(review): confusion_matrix expects (y_true, y_pred); predictions
        # are passed first here, which swaps fp<->fn — confirm intent.
        tn, fp, fn, tp = confusion_matrix(classifierstored, Y_test).ravel()
        # save the fitted model to disk so it can be reloaded without retraining
        filename = key + "_finalized_model_15000.sav"
        joblib.dump(classifier, filename)
        t_end = time.perf_counter()
        t_diff = t_end - t_start
        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)
        df_results.loc[count, 'classifier'] = key
        df_results.loc[count, 'train_score'] = train_score
        df_results.loc[count, 'test_score'] = test_score
        df_results.loc[count, 'training_time'] = t_diff
        df_results.loc[count, 'true negative'] = tn
        df_results.loc[count, 'false positive'] = fp
        df_results.loc[count, 'false negative'] = fn
        df_results.loc[count, 'true positive'] = tp
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=key, f=t_diff))
        count += 1
    return df_results
# Train/evaluate all classifiers and show the leaderboard, best test score first.
df_results = batch_classify(X_train, Y_train, X_test, Y_test)
display(df_results.sort_values(by='test_score', ascending=False))
#load the model from disk
#loaded_model = joblib.load(filename)
#result = loaded_model.score(X_test, Y_test)
#print(result)
# Per-feature box plots split by label (columns 1..505 of the feature table).
# NOTE(review): the loop body below lost its indentation in this paste.
import seaborn as sns
t1=[]
t1=df_datasets.columns
print(t1)
for i in range(1,506,1):
sns.boxplot(x = 'label',y = t1[i],data = df_datasets)
plt.legend()
plt.show()
plt.close()
#load the model from disk
#loaded_model = joblib.load('Nearest_Neighbors_finalized_model.sav')
#result = loaded_model.score(X_test, Y_test)
#result
#dataframeb = pd.DataFrame(columns=list(df_datasets.columns.values)[1:])
#dataframeb.to_csv('blankDataFrame.csv', index=False)
# Build a one-row feature frame for a single unseen manifest dump, using the
# same extraction logic as parseData (no label/file_name columns here).
# NOTE(review): loop bodies below lost their indentation in this paste; the
# file is also opened four times — one read plus a re-open per pass.
testFrame = pd.read_csv('blankDataFrame.csv')
#print (testFrame)
#testFrame = testFrame.drop("label",axis = 1)
read_ldata = open("somemanifestfile.ldata","r").read()
shakes = open("somemanifestfile.ldata","r")
shakes_2 = open("somemanifestfile.ldata","r")
shakes_3 = open("somemanifestfile.ldata","r")
#print (read_ldata)
all_permission = []
# Component counts: occurrences of each tag in the dump.
activity_list = read_ldata.count('ActivityList')
#print activity_list
service_list = read_ldata.count('ServiceList')
#print service_list
content_provider_list = read_ldata.count('ContentProviderList')
#print content_provider_list
broadcast_receiver_list = read_ldata.count('BroadcastReceiverList')
#print broadcast_receiver_list
intent_filter_list = read_ldata.count('IntentFilterList')
#print intent_filter_list
# Pass 1: standard android permissions ([32:] strips the tag + "android." prefix).
for whole_doc_container in shakes:
#print whole_doc_container #fetching the whole document by a for loop
if "RequestedPermissionList" in whole_doc_container: #finding all RequestedPermissionList in the whole document. Another way is re.match("RequestedPermissionList",whole_doc_container):
#print whole_doc_container
whole_doc_container = whole_doc_container.strip()
if whole_doc_container.startswith("RequestedPermissionList_"):
whole_doc_container = whole_doc_container[32:]
#all_permission.append(whole_doc_container)
if whole_doc_container.startswith("permission"):
all_permission.append(whole_doc_container) #adding the android.permission values in the list all_permission. the coloumns are added AUTOMATICALLY. WHERE IT IS ADDED AUTOMATICALLY??
if whole_doc_container not in list(testFrame.columns.values):#checking which android.permission is not in the coloumns
#print whole_doc_container
testFrame[whole_doc_container]=0#assigning its value to 0
shakes.close()
# Pass 2: hardware components ([31:] strips the tag + "android." prefix).
for whole_doc_container_2 in shakes_2:
#print shakes_2
#print whole_doc_container_2 #fetching the whole document by a for loop
if "HardwareComponentsList" in whole_doc_container_2: #finding all android.permissions in the whole document. Another way is re.match("RequestedPermissionList",whole_doc_container):
#print whole_doc_container_2
whole_doc_container_2 = whole_doc_container_2.strip()
if whole_doc_container_2.startswith("HardwareComponentsList"):
whole_doc_container_2 = whole_doc_container_2[31:]
all_permission.append(whole_doc_container_2) #adding the android.permission values in the list all_permission. the coloumns are added AUTOMATICALLY. WHERE IT IS ADDED AUTOMATICALLY??
if whole_doc_container_2 not in list(testFrame.columns.values):#checking which android.permission is not in the coloumns
#print whole_doc_container_2
testFrame[whole_doc_container_2]=0#assigning its value to 0
shakes_2.close()
# Pass 3: custom (non-"android.permission") permission entries.
# NOTE(review): stores the LENGTH of the last custom name, not a count —
# same apparent bug as in parseData; shakes_3 is also never closed.
custom_permissions = 0
for whole_doc_container_3 in shakes_3:
#print whole_doc_container
if "RequestedPermissionList" in whole_doc_container_3:
whole_doc_container_3 = whole_doc_container_3.strip()
if whole_doc_container_3.startswith("RequestedPermissionList"):
whole_doc_container_3 = whole_doc_container_3[32:]
#print whole_doc_container
if not whole_doc_container_3.startswith("permission"):
#print whole_doc_container
custom_permissions = len(whole_doc_container_3)
#print custom_permissions
# Fixed columns first, then 0/1 flags for every dynamic column (index 6 on).
data = [activity_list, service_list, content_provider_list, broadcast_receiver_list, intent_filter_list, custom_permissions] + [1 if p in all_permission else 0 for p in list(testFrame.columns.values)[6:]]
testFrame.loc[len(testFrame)] = data
#loaded_model.fit(X_test,Y_test)
testFrame
# Reload every saved 15000-sample model, report its held-out score, and
# predict the class of the single-row testFrame built above.
target = []
'''"Logistic_Regression":
"Nearest_Neighbors":
"Linear_SVM":
"Gradient_Boosting_Classifier"
"Decision_Tree":
"Random_Forest":
"Neural_Net":
"Naive_Bayes'''
#load the model from disk
loaded_model = joblib.load('Neural_Net_finalized_model_15000.sav')
loaded_model_1 = joblib.load('Nearest_Neighbors_finalized_model_15000.sav')
loaded_model_2 = joblib.load('Random_Forest_finalized_model_15000.sav')
loaded_model_3 = joblib.load('Naive_Bayes_finalized_model_15000.sav')
loaded_model_4 = joblib.load('Logistic_Regression_finalized_model_15000.sav')
loaded_model_5 = joblib.load('Gradient_Boosting_Classifier_finalized_model_15000.sav')
loaded_model_6 = joblib.load('Decision_Tree_finalized_model_15000.sav')
loaded_model_7 = joblib.load('Linear_SVM_finalized_model_15000.sav')
# Same score/predict pattern for each model; X_test/Y_test are the split above.
print ('Neural Net')
result = loaded_model.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target=loaded_model.predict(testFrame)
print(target)
print ('Nearest Neighbours')
result_1 = loaded_model_1.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_1=loaded_model_1.predict(testFrame)
print(target_1)
print ('Random_Forest')
result_2 = loaded_model_2.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_2=loaded_model_2.predict(testFrame)
print(target_2)
print ('Naive_Bayes')
result_3 = loaded_model_3.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_3=loaded_model_3.predict(testFrame)
print(target_3)
print ('Logistic Regression')
result_4 = loaded_model_4.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_4=loaded_model_4.predict(testFrame)
print(target_4)
print ('Gradient Boosting')
result_5 = loaded_model_5.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_5=loaded_model_5.predict(testFrame)
print(target_5)
print ('Decision Tree')
result_6 = loaded_model_6.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_6=loaded_model_6.predict(testFrame)
print(target_6)
print ('Linear SVM')
result_7 = loaded_model_7.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_7=loaded_model_7.predict(testFrame)
print(target_7)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import csv
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from IPython.display import display
# Second run of the same pipeline on the 10800-sample feature table.
filename_datasets = 'Dataset_Final_10800.csv'
df_datasets = pd.read_csv(filename_datasets)
df_datasets = df_datasets.drop(['file_name'],axis=1)
df_datasets.duplicated() # flags duplicate rows; NOTE(review): return value unused
df_datasets.drop_duplicates() # NOTE(review): result not assigned back — no effect
#basic plot
#plt.boxplot(df_datasets)
'''#notched plot
plt.figure()
plt.boxplot(df_datasets, 1)
# change outlier point symbols
plt.figure()
plt.boxplot(df_datasets, 0, 'gD')
# don't show outlier points
plt.figure()
plt.boxplot(df_datasets, 0, '')
# horizontal boxes
plt.figure()
plt.boxplot(df_datasets, 0, 'rs', 0)
# change whisker length
plt.figure()
plt.boxplot(df_datasets, 0, 'rs', 0, 0.75)
'''
print(df_datasets.shape)
display(df_datasets.head())
display(df_datasets.describe())
# Heatmap of pairwise feature correlations.
correlation_matrix = df_datasets.corr()
plt.figure(figsize=(10,8))
print('Heatmap')
ax = sns.heatmap(correlation_matrix, vmax=1, square=True,annot=True,cmap='RdYlGn')
plt.title('Correlation matrix between the features')
plt.show()
def get_train_test(df, y_col, ratio):
    """Randomly split df into train/test feature matrices and label vectors.

    Each row independently lands in the training set with probability `ratio`.

    Parameters:
        df    -- DataFrame containing features plus the label column.
        y_col -- name of the label column.
        ratio -- expected fraction of rows assigned to the training set.

    Returns (X_train, Y_train, X_test, Y_test) as numpy arrays.
    """
    mask = np.random.rand(len(df)) < ratio
    # .copy() so deleting the label column below cannot raise
    # SettingWithCopyWarning or touch a view of the caller's frame
    df_train = df[mask].copy()
    df_test = df[~mask].copy()
    Y_train = df_train[y_col].values
    Y_test = df_test[y_col].values
    del df_train[y_col]
    del df_test[y_col]
    X_train = df_train.values
    X_test = df_test.values
    return X_train, Y_train, X_test, Y_test
y_col = 'label'          # target column in the feature table
train_test_ratio = 0.7   # probability a row lands in the training set
X_train, Y_train, X_test, Y_test = get_train_test(df_datasets, y_col, train_test_ratio)
# Models to benchmark; the keys double as file-name prefixes for saved models.
dict_classifiers = {
"Logistic_Regression": LogisticRegression(),
"Nearest_Neighbors": KNeighborsClassifier(),
"Linear_SVM": SVC(),
"Gradient_Boosting_Classifier": GradientBoostingClassifier(),
"Decision_Tree": tree.DecisionTreeClassifier(),
"Random_Forest": RandomForestClassifier(n_estimators = 18),
"Neural_Net": MLPClassifier(alpha = 1),
"Naive_Bayes": GaussianNB()
}
no_classifiers = len(dict_classifiers.keys())
def batch_classify(X_train, Y_train, X_test, Y_test, verbose = True):
    """Fit every model in the module-level dict_classifiers and benchmark it.

    Each fitted model is persisted to '<name>_finalized_model_10800.sav'.
    Returns a DataFrame with one row per classifier: scores, training time,
    and confusion-matrix counts on the test set.
    """
    df_results = pd.DataFrame(
        data=np.zeros(shape=(no_classifiers, 8)),
        columns=['classifier', 'train_score', 'test_score', 'training_time',
                 'true negative', 'false positive', 'false negative', 'true positive'])
    count = 0
    for key, classifier in dict_classifiers.items():
        # time.clock() was removed in Python 3.8; perf_counter() replaces it
        t_start = time.perf_counter()
        classifier.fit(X_train, Y_train)
        classifierstored = classifier.predict(X_test)
        # NOTE(review): confusion_matrix expects (y_true, y_pred); predictions
        # are passed first here, which swaps fp<->fn — confirm intent.
        tn, fp, fn, tp = confusion_matrix(classifierstored, Y_test).ravel()
        # save the fitted model to disk so it can be reloaded without retraining
        filename = key + "_finalized_model_10800.sav"
        joblib.dump(classifier, filename)
        t_end = time.perf_counter()
        t_diff = t_end - t_start
        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)
        df_results.loc[count, 'classifier'] = key
        df_results.loc[count, 'train_score'] = train_score
        df_results.loc[count, 'test_score'] = test_score
        df_results.loc[count, 'training_time'] = t_diff
        df_results.loc[count, 'true negative'] = tn
        df_results.loc[count, 'false positive'] = fp
        df_results.loc[count, 'false negative'] = fn
        df_results.loc[count, 'true positive'] = tp
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=key, f=t_diff))
        count += 1
    return df_results
# Train/evaluate all classifiers and show the leaderboard, best test score first.
df_results = batch_classify(X_train, Y_train, X_test, Y_test)
display(df_results.sort_values(by='test_score', ascending=False))
#load the model from disk
#loaded_model = joblib.load(filename)
#result = loaded_model.score(X_test, Y_test)
#print(result)
# Per-feature box plots split by label (columns 1..505 of the feature table).
# NOTE(review): the loop body below lost its indentation in this paste.
import seaborn as sns
t1=[]
t1=df_datasets.columns
print(t1)
for i in range(1,506,1):
sns.boxplot(x = 'label',y = t1[i],data = df_datasets)
plt.legend()
plt.show()
plt.close()
#dataframeb = pd.DataFrame(columns=list(df_datasets.columns.values)[1:])
#dataframeb.to_csv('blankDataFrame.csv', index=False)
# Build a one-row feature frame for a single unseen manifest dump, using the
# same extraction logic as parseData (no label/file_name columns here).
# NOTE(review): loop bodies below lost their indentation in this paste; the
# file is also opened four times — one read plus a re-open per pass.
testFrame = pd.read_csv('blankDataFrame.csv')
#print (testFrame)
#testFrame = testFrame.drop("label",axis = 1)
read_ldata = open("somemanifestfile.ldata","r").read()
shakes = open("somemanifestfile.ldata","r")
shakes_2 = open("somemanifestfile.ldata","r")
shakes_3 = open("somemanifestfile.ldata","r")
#print (read_ldata)
all_permission = []
# Component counts: occurrences of each tag in the dump.
activity_list = read_ldata.count('ActivityList')
#print activity_list
service_list = read_ldata.count('ServiceList')
#print service_list
content_provider_list = read_ldata.count('ContentProviderList')
#print content_provider_list
broadcast_receiver_list = read_ldata.count('BroadcastReceiverList')
#print broadcast_receiver_list
intent_filter_list = read_ldata.count('IntentFilterList')
#print intent_filter_list
# Pass 1: standard android permissions ([32:] strips the tag + "android." prefix).
for whole_doc_container in shakes:
#print whole_doc_container #fetching the whole document by a for loop
if "RequestedPermissionList" in whole_doc_container: #finding all RequestedPermissionList in the whole document. Another way is re.match("RequestedPermissionList",whole_doc_container):
#print whole_doc_container
whole_doc_container = whole_doc_container.strip()
if whole_doc_container.startswith("RequestedPermissionList_"):
whole_doc_container = whole_doc_container[32:]
#all_permission.append(whole_doc_container)
if whole_doc_container.startswith("permission"):
all_permission.append(whole_doc_container) #adding the android.permission values in the list all_permission. the coloumns are added AUTOMATICALLY. WHERE IT IS ADDED AUTOMATICALLY??
if whole_doc_container not in list(testFrame.columns.values):#checking which android.permission is not in the coloumns
#print whole_doc_container
testFrame[whole_doc_container]=0#assigning its value to 0
shakes.close()
# Pass 2: hardware components ([31:] strips the tag + "android." prefix).
for whole_doc_container_2 in shakes_2:
#print shakes_2
#print whole_doc_container_2 #fetching the whole document by a for loop
if "HardwareComponentsList" in whole_doc_container_2: #finding all android.permissions in the whole document. Another way is re.match("RequestedPermissionList",whole_doc_container):
#print whole_doc_container_2
whole_doc_container_2 = whole_doc_container_2.strip()
if whole_doc_container_2.startswith("HardwareComponentsList"):
whole_doc_container_2 = whole_doc_container_2[31:]
all_permission.append(whole_doc_container_2) #adding the android.permission values in the list all_permission. the coloumns are added AUTOMATICALLY. WHERE IT IS ADDED AUTOMATICALLY??
if whole_doc_container_2 not in list(testFrame.columns.values):#checking which android.permission is not in the coloumns
#print whole_doc_container_2
testFrame[whole_doc_container_2]=0#assigning its value to 0
shakes_2.close()
# Pass 3: custom (non-"android.permission") permission entries.
# NOTE(review): stores the LENGTH of the last custom name, not a count —
# same apparent bug as in parseData; shakes_3 is also never closed.
custom_permissions = 0
for whole_doc_container_3 in shakes_3:
#print whole_doc_container
if "RequestedPermissionList" in whole_doc_container_3:
whole_doc_container_3 = whole_doc_container_3.strip()
if whole_doc_container_3.startswith("RequestedPermissionList"):
whole_doc_container_3 = whole_doc_container_3[32:]
#print whole_doc_container
if not whole_doc_container_3.startswith("permission"):
#print whole_doc_container
custom_permissions = len(whole_doc_container_3)
#print custom_permissions
# Fixed columns first, then 0/1 flags for every dynamic column (index 6 on).
data = [activity_list, service_list, content_provider_list, broadcast_receiver_list, intent_filter_list, custom_permissions] + [1 if p in all_permission else 0 for p in list(testFrame.columns.values)[6:]]
testFrame.loc[len(testFrame)] = data
#loaded_model.fit(X_test,Y_test)
testFrame
# Reload every saved 10800-sample model, report its held-out score, and
# predict the class of the single-row testFrame built above.
target = []
'''"Logistic_Regression":
"Nearest_Neighbors":
"Linear_SVM":
"Gradient_Boosting_Classifier"
"Decision_Tree":
"Random_Forest":
"Neural_Net":
"Naive_Bayes'''
#load the model from disk
loaded_model = joblib.load('Neural_Net_finalized_model_10800.sav')
loaded_model_1 = joblib.load('Nearest_Neighbors_finalized_model_10800.sav')
loaded_model_2 = joblib.load('Random_Forest_finalized_model_10800.sav')
loaded_model_3 = joblib.load('Naive_Bayes_finalized_model_10800.sav')
loaded_model_4 = joblib.load('Logistic_Regression_finalized_model_10800.sav')
loaded_model_5 = joblib.load('Gradient_Boosting_Classifier_finalized_model_10800.sav')
loaded_model_6 = joblib.load('Decision_Tree_finalized_model_10800.sav')
loaded_model_7 = joblib.load('Linear_SVM_finalized_model_10800.sav')
# Same score/predict pattern for each model; X_test/Y_test are the split above.
print ('Neural Net')
result = loaded_model.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target=loaded_model.predict(testFrame)
print(target)
print ('Nearest Neighbours')
result_1 = loaded_model_1.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_1=loaded_model_1.predict(testFrame)
print(target_1)
print ('Random_Forest')
result_2 = loaded_model_2.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_2=loaded_model_2.predict(testFrame)
print(target_2)
print ('Naive_Bayes')
result_3 = loaded_model_3.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_3=loaded_model_3.predict(testFrame)
print(target_3)
print ('Logistic Regression')
result_4 = loaded_model_4.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_4=loaded_model_4.predict(testFrame)
print(target_4)
print ('Gradient Boosting')
result_5 = loaded_model_5.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_5=loaded_model_5.predict(testFrame)
print(target_5)
print ('Decision Tree')
result_6 = loaded_model_6.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_6=loaded_model_6.predict(testFrame)
print(target_6)
print ('Linear SVM')
result_7 = loaded_model_7.score(X_test, Y_test)
#loaded_model.fit(train_data, target)
#testFrame=testFrame.drop(['label'])
target_7=loaded_model_7.predict(testFrame)
print(target_7)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import csv
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from IPython.display import display
# Third run of the same pipeline on the 10000-sample feature table.
filename_datasets = 'Dataset_Final_10000.csv'
df_datasets = pd.read_csv(filename_datasets)
df_datasets = df_datasets.drop(['file_name'],axis=1)
df_datasets.duplicated() # flags duplicate rows; NOTE(review): return value unused
df_datasets.drop_duplicates() # NOTE(review): result not assigned back — no effect
#basic plot
#plt.boxplot(df_datasets)
'''#notched plot
plt.figure()
plt.boxplot(df_datasets, 1)
# change outlier point symbols
plt.figure()
plt.boxplot(df_datasets, 0, 'gD')
# don't show outlier points
plt.figure()
plt.boxplot(df_datasets, 0, '')
# horizontal boxes
plt.figure()
plt.boxplot(df_datasets, 0, 'rs', 0)
# change whisker length
plt.figure()
plt.boxplot(df_datasets, 0, 'rs', 0, 0.75)
'''
print(df_datasets.shape)
display(df_datasets.head())
display(df_datasets.describe())
# Heatmap of pairwise feature correlations.
correlation_matrix = df_datasets.corr()
plt.figure(figsize=(10,8))
print('Heatmap')
ax = sns.heatmap(correlation_matrix, vmax=1, square=True,annot=True,cmap='RdYlGn')
plt.title('Correlation matrix between the features')
plt.show()
def get_train_test(df, y_col, ratio):
    """Randomly split df into train/test feature matrices and label vectors.

    Each row independently lands in the training set with probability `ratio`.

    Parameters:
        df    -- DataFrame containing features plus the label column.
        y_col -- name of the label column.
        ratio -- expected fraction of rows assigned to the training set.

    Returns (X_train, Y_train, X_test, Y_test) as numpy arrays.
    """
    mask = np.random.rand(len(df)) < ratio
    # .copy() so deleting the label column below cannot raise
    # SettingWithCopyWarning or touch a view of the caller's frame
    df_train = df[mask].copy()
    df_test = df[~mask].copy()
    Y_train = df_train[y_col].values
    Y_test = df_test[y_col].values
    del df_train[y_col]
    del df_test[y_col]
    X_train = df_train.values
    X_test = df_test.values
    return X_train, Y_train, X_test, Y_test
y_col = 'label'          # target column in the feature table
train_test_ratio = 0.7   # probability a row lands in the training set
X_train, Y_train, X_test, Y_test = get_train_test(df_datasets, y_col, train_test_ratio)
# Models to benchmark; the keys double as file-name prefixes for saved models.
dict_classifiers = {
"Logistic_Regression": LogisticRegression(),
"Nearest_Neighbors": KNeighborsClassifier(),
"Linear_SVM": SVC(),
"Gradient_Boosting_Classifier": GradientBoostingClassifier(),
"Decision_Tree": tree.DecisionTreeClassifier(),
"Random_Forest": RandomForestClassifier(n_estimators = 18),
"Neural_Net": MLPClassifier(alpha = 1),
"Naive_Bayes": GaussianNB()
}
no_classifiers = len(dict_classifiers.keys())
def batch_classify(X_train, Y_train, X_test, Y_test, verbose=True):
    """Train every classifier in the module-level ``dict_classifiers``.

    For each classifier: fit on the training split, persist the fitted model
    to ``<name>_finalized_model_10000.sav``, and record train/test accuracy,
    wall-clock training time, and the test-set confusion-matrix cells.

    Parameters
    ----------
    X_train, Y_train, X_test, Y_test : array-like
        Feature matrices and label vectors from ``get_train_test``.
    verbose : bool
        When True, print per-classifier timing as training completes.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with scores, timing and TN/FP/FN/TP counts.
    """
    df_results = pd.DataFrame(
        data=np.zeros(shape=(no_classifiers, 8)),
        columns=['classifier', 'train_score', 'test_score', 'training_time',
                 'true negative', 'false positive', 'false negative', 'true positive'])
    count = 0
    for key, classifier in dict_classifiers.items():
        # time.clock() was deprecated in 3.3 and REMOVED in Python 3.8;
        # perf_counter() is the recommended replacement for timing.
        t_start = time.perf_counter()
        classifier.fit(X_train, Y_train)
        predictions = classifier.predict(X_test)
        # BUG FIX: sklearn's confusion_matrix expects (y_true, y_pred); the
        # original passed (y_pred, y_true), which swapped the FP/FN counts.
        tn, fp, fn, tp = confusion_matrix(Y_test, predictions).ravel()
        # Save the fitted model to disk so it can be reloaded later without
        # retraining.  (Kept inside the timed region, as in the original.)
        filename = key + "_finalized_model_10000.sav"
        joblib.dump(classifier, filename)
        t_end = time.perf_counter()
        t_diff = t_end - t_start
        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)
        df_results.loc[count, 'classifier'] = key
        df_results.loc[count, 'train_score'] = train_score
        df_results.loc[count, 'test_score'] = test_score
        df_results.loc[count, 'training_time'] = t_diff
        df_results.loc[count, 'true negative'] = tn
        df_results.loc[count, 'false positive'] = fp
        df_results.loc[count, 'false negative'] = fn
        df_results.loc[count, 'true positive'] = tp
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=key, f=t_diff))
        count += 1
    return df_results
df_results = batch_classify(X_train, Y_train, X_test, Y_test)
display(df_results.sort_values(by='test_score', ascending=False))
# Per-feature box plots split by class label.
import seaborn as sns
t1 = df_datasets.columns
print(t1)
# Skip column 0 (presumably the label itself — confirm against the CSV
# layout).  The original hard-coded range(1, 506), which silently skipped
# the final column and would break if the feature count ever changed;
# len(t1) adapts to the actual number of columns.
for i in range(1, len(t1)):
    sns.boxplot(x='label', y=t1[i], data=df_datasets)
    plt.legend()
    plt.show()
    plt.close()
# ---------------------------------------------------------------------------
# Build a single-row feature vector for one APK from its manifest dump
# ("somemanifestfile.ldata"), using the same column layout as the training
# data (the empty template stored in blankDataFrame.csv).
# ---------------------------------------------------------------------------
#dataframeb = pd.DataFrame(columns=list(df_datasets.columns.values)[1:])
#dataframeb.to_csv('blankDataFrame.csv', index=False)
testFrame = pd.read_csv('blankDataFrame.csv')
#print (testFrame)
#testFrame = testFrame.drop("label",axis = 1)
# The manifest dump is opened four times: once fully read for the count-based
# features, plus three line-iterating handles (one per scanning pass below).
# NOTE(review): the handle from the first open() is never closed.
read_ldata = open("somemanifestfile.ldata","r").read()
shakes = open("somemanifestfile.ldata","r")
shakes_2 = open("somemanifestfile.ldata","r")
shakes_3 = open("somemanifestfile.ldata","r")
#print (read_ldata)
# Names collected from the manifest (requested permissions + hardware
# components); used at the end to set the 0/1 indicator columns.
all_permission = []
# Simple count features: number of occurrences of each section marker in the
# raw manifest text.
activity_list = read_ldata.count('ActivityList')
service_list = read_ldata.count('ServiceList')
content_provider_list = read_ldata.count('ContentProviderList')
broadcast_receiver_list = read_ldata.count('BroadcastReceiverList')
intent_filter_list = read_ldata.count('IntentFilterList')
# Pass 1: collect requested permissions.  The [32:] slice strips the fixed
# "RequestedPermissionList_..." prefix — presumably
# "RequestedPermissionList_android." (32 chars), leaving "permission.X";
# TODO confirm against the actual .ldata format.
for whole_doc_container in shakes:
    if "RequestedPermissionList" in whole_doc_container:
        whole_doc_container = whole_doc_container.strip()
        if whole_doc_container.startswith("RequestedPermissionList_"):
            whole_doc_container = whole_doc_container[32:]
            #all_permission.append(whole_doc_container)
        # Only standard "permission.*" names become feature columns; any
        # name not already in the template frame is added with value 0.
        if whole_doc_container.startswith("permission"):
            all_permission.append(whole_doc_container)
            if whole_doc_container not in list(testFrame.columns.values):
                testFrame[whole_doc_container]=0
shakes.close()
# Pass 2: collect hardware components.  The [31:] slice strips the fixed
# "HardwareComponentsList..." prefix (presumably through "_android.",
# 31 chars) — TODO confirm against the actual .ldata format.
for whole_doc_container_2 in shakes_2:
    if "HardwareComponentsList" in whole_doc_container_2:
        whole_doc_container_2 = whole_doc_container_2.strip()
        if whole_doc_container_2.startswith("HardwareComponentsList"):
            whole_doc_container_2 = whole_doc_container_2[31:]
        all_permission.append(whole_doc_container_2)
        if whole_doc_container_2 not in list(testFrame.columns.values):
            testFrame[whole_doc_container_2]=0
shakes_2.close()
# Pass 3: derive the "custom permissions" feature from requested-permission
# lines that do NOT use the standard "permission.*" namespace.
custom_permissions = 0
for whole_doc_container_3 in shakes_3:
    if "RequestedPermissionList" in whole_doc_container_3:
        whole_doc_container_3 = whole_doc_container_3.strip()
        if whole_doc_container_3.startswith("RequestedPermissionList"):
            whole_doc_container_3 = whole_doc_container_3[32:]
            if not whole_doc_container_3.startswith("permission"):
                # NOTE(review): this stores len() of the LAST matching line
                # and overwrites on every iteration — it looks like it was
                # meant to COUNT custom permissions (custom_permissions += 1).
                # Confirm intended semantics before changing.
                custom_permissions = len(whole_doc_container_3)
#print custom_permissions
# Assemble the full feature row: the five section counts + the custom
# permission feature, followed by a 0/1 indicator for every remaining
# template column (columns [6:] — presumably the permission/hardware
# indicators; verify against blankDataFrame.csv).
data = [activity_list, service_list, content_provider_list, broadcast_receiver_list, intent_filter_list, custom_permissions] + [1 if p in all_permission else 0 for p in list(testFrame.columns.values)[6:]]
testFrame.loc[len(testFrame)] = data
#loaded_model.fit(X_test,Y_test)
# Bare expression: a leftover notebook-style echo of the frame (no effect in
# a plain script).
testFrame
target = []
# ---------------------------------------------------------------------------
# Reload every persisted classifier, score it on the held-out test split and
# classify the freshly-built single-sample testFrame.
#
# The original code repeated the same four-line stanza eight times; this loop
# is data-driven but reproduces the exact same prints in the same order, and
# the per-model module-level names (loaded_model..loaded_model_7,
# result..result_7, target..target_7) are re-created afterwards for backward
# compatibility with any code further down the file.
# ---------------------------------------------------------------------------
_model_specs = [
    ('Neural Net', 'Neural_Net_finalized_model_10000.sav'),
    ('Nearest Neighbours', 'Nearest_Neighbors_finalized_model_10000.sav'),
    ('Random_Forest', 'Random_Forest_finalized_model_10000.sav'),
    ('Naive_Bayes', 'Naive_Bayes_finalized_model_10000.sav'),
    ('Logistic Regression', 'Logistic_Regression_finalized_model_10000.sav'),
    ('Gradient Boosting', 'Gradient_Boosting_Classifier_finalized_model_10000.sav'),
    ('Decision Tree', 'Decision_Tree_finalized_model_10000.sav'),
    ('Linear SVM', 'Linear_SVM_finalized_model_10000.sav'),
]
_loaded, _scores, _predictions = [], [], []
for _label, _fname in _model_specs:
    _model = joblib.load(_fname)
    print(_label)
    # Test-set accuracy: computed but not printed, matching the original.
    _scores.append(_model.score(X_test, Y_test))
    _pred = _model.predict(testFrame)
    print(_pred)
    _loaded.append(_model)
    _predictions.append(_pred)

(loaded_model, loaded_model_1, loaded_model_2, loaded_model_3,
 loaded_model_4, loaded_model_5, loaded_model_6, loaded_model_7) = _loaded
(result, result_1, result_2, result_3,
 result_4, result_5, result_6, result_7) = _scores
(target, target_1, target_2, target_3,
 target_4, target_5, target_6, target_7) = _predictions